Hospitalizations by State
- Covid Tracking Dataset (w/ hospitalised data)
- An Exploratory data analysis of the US dataset
- Basic triad of the dataset: validating data types and data integrity of each row
- Graphical Exploratory Analysis
- Analysis of Hospitalizations by State
- Since the normality of the independent variables is highly variable due to temporal and precision differences from each state, we further assess each state's data by viewing trends on the independent variables in both scatter and box-and-whisker plots.
- Alabama
- Arizona
- Arkansas
- California
- Colorado
- Connecticut
- Delaware
- Florida
- Georgia
- Hawaii
- Idaho
- Iowa
- Kansas
- Kentucky
- Louisiana
- Maine
- Maryland
- Massachusetts
- Michigan
- Minnesota
- Mississippi
- Missouri
- Montana
- Nebraska
- Nevada
- New Hampshire
- New Jersey
- New Mexico
- New York
- North Carolina
- Ohio
- Oklahoma
- Oregon
- Pennsylvania
- Rhode Island
- South Carolina
- South Dakota
- Tennessee
- Texas
- Utah
- Vermont
- Virginia
- Washington
- West Virginia
- Wisconsin
- Wyoming
- Assessing Correlation of Independent Variables
- Build model for dependent Variable
#collapse-hide
# Imports
import os
import pandas as pd
import csv
import kaggle
# other imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from copy import copy
import seaborn as sns
from scipy.stats import norm
import matplotlib.dates as mdates
# import matplotlib.colors as mcolors
# import random
# import math
# import time
# from sklearn.linear_model import LinearRegression, BayesianRidge
# from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
# from sklearn.svm import SVR
from datetime import date, datetime
from dateutil.parser import parse
import us
# import operator
# plt.style.use('fivethirtyeight')
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline
Covid Tracking Dataset (w/ hospitalised data)
Source: https://covidtracking.com/
#collapse-hide
all_cases = pd.read_csv('https://covidtracking.com/api/v1/states/daily.csv')
# Delete unecessary rows
for row in ['negative', 'pending', 'hash', 'negativeIncrease', 'totalTestResults', 'totalTestResultsIncrease', 'dateChecked', 'fips', 'inIcuCumulative', 'onVentilatorCumulative', 'total', 'posNeg', 'deathIncrease', 'hospitalizedIncrease', 'positiveIncrease']:
del all_cases[row]
# TODO missing values
# Do we get avg or missing values, or predict them?
# See https://developerzen.com/data-mining-handling-missing-values-the-database-bd2241882e72
for i, row in all_cases.iterrows():
# Set Dates
s = str(row['date'])
all_cases.at[i, 'date'] = date(year=int(s[0:4]), month=int(s[4:6]), day=int(s[6:8]))
# Missing death figures means no death reports yet
# These are set to 0
for i, row in all_cases.iterrows():
if np.isnan(row['death']):
all_cases.at[i, 'death'] = 0
#collapse-hide
# TODO Replace active cases with JHU and/or regression model (Selma)
all_cases['active'] = all_cases['positive'] - all_cases['recovered'] - all_cases['death']
# change location of 'active' column
cols = list(all_cases)
cols.insert(3, cols.pop(cols.index('active')))
all_cases = all_cases.loc[:, cols]
#collapse-hide
# Load datasets for US population and Hospital beds per 1000
us_population = pd.read_csv('data/us_population.csv')
hosp_beds = pd.read_csv('data/hospital_beds.csv')
state_abbrev = pd.read_csv('data/us_state_names.csv')
# add state abbreviations to us_population and hospital beds dataframe
for state in state_abbrev['State'].tolist():
# store state abbreviation in variable
abbrev = state_abbrev.loc[state_abbrev['State'] == state, 'Abbreviation'].tolist()[0]
# add abbrev to new column 'Abbreviation' in us_population df
us_population.loc[us_population['State'] == state, 'Abbreviation'] = abbrev
# add abbrev to new column in hosp_beds df
hosp_beds.loc[hosp_beds['Location'] == state, 'Abbreviation'] = abbrev
# change order of columns of us_population
cols = list(us_population)
cols.insert(2, cols.pop(cols.index('Abbreviation')))
us_population = us_population.loc[:, cols]
# drop unnecessary columns of us_population
us_population = us_population.drop(columns=['rank', 'Growth', 'Pop2018', 'Pop2010', 'growthSince2010', 'Percent', 'density'])
# drop unnecessary columns of hosp_beds
hosp_beds = hosp_beds.drop(columns=['Location', 'State/Local Government', 'Non-Profit', 'For-Profit'])
# change order of columns of hosp_beds
cols = list(hosp_beds)
cols.insert(0, cols.pop(cols.index('Abbreviation')))
hosp_beds = hosp_beds.loc[:, cols]
us_population.head()
hosp_beds.head()
#collapse-hide
# filter out non-existing states like 'AS'
all_cases = all_cases[all_cases['state'].isin(state_abbrev['Abbreviation'].tolist())]
# see what filtered dataframe looks like
all_cases.head()
#collapse-hide
# Split dataframes by date
df_split_by_date = dict(tuple(all_cases.groupby('date')))
# Split dataframes by state
df_split_by_state = dict(tuple(all_cases.groupby('state')))
# merge dataframes us_population and all_cases
df_merge_uspop = all_cases.merge(us_population, how='left', left_on='state', right_on='Abbreviation')
df_merge_uspop = df_merge_uspop.drop(columns=['Abbreviation'])
df_merge_uspop = df_merge_uspop.rename(columns={'Pop': 'population'})
# change location of 'population' column
cols = list(df_merge_uspop)
cols.insert(2, cols.pop(cols.index('population')))
df_merge_uspop = df_merge_uspop.loc[:, cols]
# merge dataframes hosp_beds and df_merge_uspop
df_merge_hosp = df_merge_uspop.merge(hosp_beds, how='left', left_on='state', right_on='Abbreviation')
df_merge_hosp = df_merge_hosp.drop(columns=['Abbreviation'])
all_cases = df_merge_hosp.rename(columns={'Total': 'bedsPerThousand'})
all_cases.head()
#collapse-hide
# Calculate the total beds, and add the column
all_cases['total_beds'] = all_cases['population'] / 1000 * all_cases['bedsPerThousand']
# change abbreviations to state names
all_cases = all_cases.rename(columns={'state': 'abbrev'})
all_cases = all_cases.rename(columns={'State': 'state'})
# change location of 'state' column
cols = list(all_cases)
cols.insert(1, cols.pop(cols.index('state')))
all_cases = all_cases.loc[:, cols]
all_cases.head()
- Load and clean JHU data
- Merge JHU dataset with main dataset
#collapse-hide
# This cell takes some time, as it needs to connect to Kaggle Servers to retrieve data
kaggle.api.authenticate()
kaggle.api.dataset_download_files('benhamner/jhucovid19', path='./kaggle/input/jhucovid19/', unzip=True)
#collapse-hide
# Get Time-Series Data of cases as Pandas DataFrame
dir_jhu = './kaggle/input/jhucovid19/csse_covid_19_data/csse_covid_19_daily_reports'
df_list = []
for dirname, _, files in os.walk(dir_jhu):
for file in files:
if 'gitignore' not in file and 'README' not in file:
full_dir = os.path.join(dirname, file)
df_list.append(pd.read_csv(full_dir))
jhu_df = pd.concat(df_list, axis=0, ignore_index=True, sort=True)
# convert Last Update columns to datetime format
jhu_df.loc[:, 'Last Update'] = pd.to_datetime(jhu_df['Last Update']).apply(lambda x: x.date())
jhu_df.loc[:, 'Last_Update'] = pd.to_datetime(jhu_df['Last_Update']).apply(lambda x: x.date())
# Combine Last Update with Last_Update
jhu_df['LastUpdate'] = jhu_df['Last_Update'].combine_first(jhu_df['Last Update'])
# Combine Country/Region with Country_Region
jhu_df['CountryRegion'] = jhu_df['Country/Region'].combine_first(jhu_df['Country_Region'])
# Retrieve only US data
jhu_df = jhu_df[jhu_df['CountryRegion']=='US']
# Combine Province/State with Province_State
jhu_df['ProvinceState'] = jhu_df['Province/State'].combine_first(jhu_df['Province_State'])
# Drop unnecessary columns
jhu_df = jhu_df.drop(['Admin2', 'Lat', 'Latitude', 'Long_', 'Longitude', 'Combined_Key', 'Country/Region',
'Country_Region', 'Province/State', 'Province_State',
'Last Update', 'Last_Update', 'FIPS'], axis=1)
# Change column order
cols = list(jhu_df)
cols.insert(0, cols.pop(cols.index('CountryRegion')))
cols.insert(1, cols.pop(cols.index('ProvinceState')))
cols.insert(2, cols.pop(cols.index('LastUpdate')))
jhu_df = jhu_df.loc[:, cols]
# Change region to known US states
state_abbrs_dict = {}
for state in us.states.STATES:
state_abbrs_dict[state.abbr] = state.name
def toState(input_state, mapping):
abbreviation = input_state.rstrip()[-2:]
try:
return_value = mapping[abbreviation]
except KeyError:
return_value = input_state
return return_value
jhu_df['ProvinceState'] = jhu_df['ProvinceState'].apply(lambda x: toState(x, state_abbrs_dict) if x != 'Washington, D.C.' else 'District of Columbia')
# Filter out unknown states
jhu_df = jhu_df[jhu_df['ProvinceState'].isin(all_cases.state.unique().tolist())]
# Merge-sum rows with same date and State
jhu_df = jhu_df.groupby(['LastUpdate', 'ProvinceState']).agg(
{
'Active': sum,
'Confirmed': sum,
'Deaths': sum,
'Recovered': sum
}
).reset_index()
jhu_df.tail()
#collapse-hide
# Now that we have the JHU dataset relatively cleaned
# we can go ahead and merge its data with our main dataset
for i, row in all_cases.iterrows():
last_update = all_cases.at[i, 'date']
state = all_cases.at[i, 'state']
matching_row = jhu_df[jhu_df['ProvinceState'] == state]
matching_row = matching_row[matching_row['LastUpdate'] == last_update].reset_index()
if len(matching_row.values) > 0:
#all_cases.at[i, 'positive'] = matching_row['Confirmed'].values[0]
all_cases.at[i, 'active'] = matching_row['Active'].values[0]
#all_cases.at[i, 'recovered'] = matching_row['Recovered'].values[0] --- JHU was inconsistent, therefore removed
#all_cases.at[i, 'death'] = matching_row['Deaths'].values[0]
# Replace unknown recovery numbers with 0
if np.isnan(row['recovered']):
all_cases.at[i, 'recovered'] = 0
if all_cases.at[i, 'active'] == 0 or np.isnan(row['active']):
positive = all_cases.at[i, 'positive']
recovered = all_cases.at[i, 'recovered']
dead = all_cases.at[i, 'death']
all_cases.at[i, 'active'] = positive - recovered - dead
all_cases.tail()
#collapse-hide
# Save formatted dataset offline in case of disaster
dataset_file = 'results/all_cases.csv'
all_cases.to_csv(dataset_file)
#collapse-hide
# convert date to datetime format
all_cases['date'] = pd.to_datetime(all_cases['date'])
#collapse-hide
dataset_file = 'results/all_cases.csv'
covid_df = pd.read_csv(dataset_file, index_col=0)
# convert date to datetime format
covid_df['date'] = pd.to_datetime(covid_df['date'])
covid_df.info()
# set float format to 3 decimals
pd.set_option('display.float_format', lambda x: '%.3f' % x)
covid_df.head()
The NaN values may indicate that there were few or no Covid-19 patients at these dates. We further analyse the statistical values of the dataset columns to ensure data integrity and accuracy.
covid_df.describe()
# TODO rounding up the numbers
#collapse-hide
# drop unnecessary columns
covid_cleaned = covid_df.drop(['hospitalized', 'bedsPerThousand'], axis=1)
covid_100k = covid_cleaned.copy()
# list of columns to transform to per 100k
columns_list = ['positive', 'active', 'recovered', 'death', 'hospitalizedCurrently', 'hospitalizedCumulative', 'inIcuCurrently', 'onVentilatorCurrently', 'total_beds']
# add columns per 100k
for column in columns_list:
if column == 'total_beds':
covid_100k['BedsPer100k'.format(column)] = (covid_cleaned.loc[:, column] / covid_cleaned.loc[:, 'population']) * 100000
else:
covid_100k['{}_100k'.format(column)] = (covid_cleaned.loc[:, column] / covid_cleaned.loc[:, 'population']) * 100000
covid_100k = covid_100k.drop(columns_list, axis=1)
#collapse-hide
covid_100k['date'] = pd.to_datetime(covid_100k['date'])
start_date = '2020-04-18'
end_date = '2020-05-19'
mask = (covid_100k['date'] > start_date) & (covid_100k['date'] <= end_date)
covid_100k_last_month = covid_100k.loc[mask]
#collapse-hide
covid_100k_last_month_part1 = covid_100k_last_month.groupby('date').sum().loc[:, ['positive_100k','active_100k','recovered_100k','death_100k','hospitalizedCumulative_100k']].diff(periods=1, axis=0)
covid_100k_last_month_part2 = covid_100k_last_month.groupby('date').sum().loc[:, ['inIcuCurrently_100k','onVentilatorCurrently_100k','BedsPer100k']]
final_100k_last_month = covid_100k_last_month_part1.merge(covid_100k_last_month_part2, left_index=True, right_index=True)
final_100k_last_month.head()
final_100k_last_month.describe()
#collapse-hide
# save description cleaned dataset to csv
describe_file = 'results/final_100k_last_month.csv'
final_100k_last_month.describe().to_csv(describe_file)
#collapse-hide
# Omitting the categorical (states/abbreviations) and time columns
# There must be an easier way for you, but this was the easiest way I could think of
covid_cleaned['date'] = pd.to_datetime(covid_cleaned['date'])
# mask data for last month
start_date = '2020-04-18'
end_date = '2020-05-19'
mask = (covid_cleaned['date'] > start_date) & (covid_cleaned['date'] <= end_date)
covid_cleaned_last_month = covid_cleaned.loc[mask]
plot_df = covid_cleaned_last_month[['population', 'active', 'recovered', 'death', 'hospitalizedCurrently', 'inIcuCurrently', 'onVentilatorCurrently', 'total_beds']]
plot_df_last_month = covid_100k_last_month[['population', 'active_100k', 'recovered_100k', 'death_100k', 'hospitalizedCurrently_100k', 'inIcuCurrently_100k', 'onVentilatorCurrently_100k', 'BedsPer100k']]
#collapse-hide
timeseries_usa_df = covid_100k.loc[:, ['date', 'positive_100k', 'active_100k', 'recovered_100k', 'death_100k', 'hospitalizedCurrently_100k', 'inIcuCurrently_100k', 'onVentilatorCurrently_100k', 'BedsPer100k']].groupby('date').sum().reset_index()
# timeseries_usa_df['log_positive'] = np.log(timeseries_usa_df['positive_100k'])
# timeseries_usa_df['log_active'] = np.log(timeseries_usa_df['active_100k'])
# timeseries_usa_df['log_recovered'] = np.log(timeseries_usa_df['recovered_100k'])
# timeseries_usa_df['log_death'] = np.log(timeseries_usa_df['death_100k'])
timeseries_usa_df.tail()
#collapse-hide
# get data from last day
# plot_df_last_date = plot_df.loc[covid_df['date'] == '2020-05-18']
# Plotting histograms to gain insight of the distribution shape, skewness and scale
fig, axs = plt.subplots(4,2,figsize = (16, 16))
sns.set()
for i, column in enumerate(plot_df_last_month.columns):
if (i + 1) % 2 == 0:
ax = axs[(i//2), 1]
else:
ax = axs[(i//2), 0]
sns.distplot(plot_df_last_month[column], fit=norm, fit_kws=dict(label='normality'), hist_kws=dict(color='plum', edgecolor='k', linewidth=1, label='frequency'), ax=ax, color='#9d53ad')
ax.legend(loc='upper right')
plt.tight_layout()
fig.subplots_adjust(top=0.95)
#collapse-hide
# Looking at linearity and variance with scatterplots
# Removing the target variable and saving it in another df
target = plot_df.hospitalizedCurrently
indep_var = plot_df.drop(columns=['hospitalizedCurrently'])
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(indep_var.columns):
ax=fig.add_subplot(4, 3, i+1)
sns.regplot(x=indep_var[col], y=target, data=indep_var, label=col, scatter_kws={'s':10}, line_kws={"color": "plum", 'label': 'hospitCurr'})
plt.suptitle('Scatterplots with Target Hospitalized Patients Showing Growth Trajectories', fontsize=23)
plt.legend()
plt.tight_layout()
fig.subplots_adjust(top=0.95)
#collapse-hide
# Assessing the normality of the distribution with a boxplot
# Boxplot with removed outliers
fig, ax = plt.subplots(figsize = (16, 12))
for i, col in enumerate(plot_df.columns):
ax=fig.add_subplot(4, 3, i+1)
sns.boxplot(x=plot_df[col], data=plot_df, color='lightblue', showfliers=False)
plt.suptitle('Boxplots of Independent Variables', fontsize=23)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
#collapse-hide
# get data from last day
plot_df_last_date = plot_df.loc[covid_df['date'] == '2020-05-18']
fig, ax = plt.subplots(figsize = (16, 12))
for i, col in enumerate(plot_df_last_date.columns):
ax=fig.add_subplot(4, 3, i+1)
sns.boxplot(x=plot_df_last_date[col], data=plot_df, color='lightblue', showfliers=True)
plt.suptitle('Boxplots of Independent Variables', fontsize=23)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(fl.date, fl.positiveTestsViral, linewidth=4.7, color='r')
plt.title('Cummulative Number of Positive Viral Tests in Florida', fontsize=23)
plt.xlabel('Date')
plt.ylabel('No. Patients')
# TODO add some explanation / look more into collinear variables
# Heatmap of correlations
# Save correlations to variable
corr = covid_cleaned.corr(method='pearson')
# We can create a mask to not show duplicate values
mask = np.triu(np.ones_like(corr, dtype=np.bool))
# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(16,16))
# Generate heatmap
sns.heatmap(corr, annot=True, mask=mask, cmap='GnBu', center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
# We compare three models:
# - Polynomial Regression
# - Linear Regression
# - ElasticNet
# Copy DFs to not mess up original one
# We will use model_df for our regression model
model_df = all_cases.copy()
# Delete redundant rows
for row in ['abbrev', 'bedsPerThousand', 'hospitalized',
'state', 'hospitalizedCumulative', 'dataQualityGrade', 'lastUpdateEt']:
del model_df[row]
# Drop NaN values for hospitalizedCurrently
model_df = model_df.dropna(subset=['hospitalizedCurrently'])
# Drop Values with abnormal active-hospitalised ratios (outside Conf. Interval)
model_df['ratio_hospital'] = model_df['hospitalizedCurrently'] / model_df['active']
model_df = model_df[~(model_df['ratio_hospital'] >= model_df.ratio_hospital.quantile(0.99))]
#model_df = model_df[~(model_df['ratio_hospital'] <= model_df['ratio_hospital'].median())]
del model_df['ratio_hospital']
# Get peek of model to use
model_df.describe()
###@rygomez